This document explores a dataset of individual rides made in a bike-sharing system covering the greater San Francisco Bay Area. The rides start between 2019-02-01 and 2019-02-28 (28 days), and the dataset contains 183,412 rows and 16 columns.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
# Read the 'fordgobike-tripdata' CSV file into the 'fgb' DataFrame.
fgb = pd.read_csv('fordgobike-tripdata.csv')
# First look at the data: dimensions, column dtypes, and the first ten rows.
for overview in (fgb.shape, fgb.dtypes, fgb.head(10)):
    print(overview)
(183412, 16)
duration_sec int64
start_time object
end_time object
start_station_id float64
start_station_name object
start_station_latitude float64
start_station_longitude float64
end_station_id float64
end_station_name object
end_station_latitude float64
end_station_longitude float64
bike_id int64
user_type object
member_birth_year float64
member_gender object
bike_share_for_all_trip object
dtype: object
duration_sec start_time end_time \
0 52185 2019-02-28 17:32:10.1450 2019-03-01 08:01:55.9750
1 42521 2019-02-28 18:53:21.7890 2019-03-01 06:42:03.0560
2 61854 2019-02-28 12:13:13.2180 2019-03-01 05:24:08.1460
3 36490 2019-02-28 17:54:26.0100 2019-03-01 04:02:36.8420
4 1585 2019-02-28 23:54:18.5490 2019-03-01 00:20:44.0740
5 1793 2019-02-28 23:49:58.6320 2019-03-01 00:19:51.7600
6 1147 2019-02-28 23:55:35.1040 2019-03-01 00:14:42.5880
7 1615 2019-02-28 23:41:06.7660 2019-03-01 00:08:02.7560
8 1570 2019-02-28 23:41:48.7900 2019-03-01 00:07:59.7150
9 1049 2019-02-28 23:49:47.6990 2019-03-01 00:07:17.0250
start_station_id start_station_name \
0 21.0 Montgomery St BART Station (Market St at 2nd St)
1 23.0 The Embarcadero at Steuart St
2 86.0 Market St at Dolores St
3 375.0 Grove St at Masonic Ave
4 7.0 Frank H Ogawa Plaza
5 93.0 4th St at Mission Bay Blvd S
6 300.0 Palm St at Willow St
7 10.0 Washington St at Kearny St
8 10.0 Washington St at Kearny St
9 19.0 Post St at Kearny St
start_station_latitude start_station_longitude end_station_id \
0 37.789625 -122.400811 13.0
1 37.791464 -122.391034 81.0
2 37.769305 -122.426826 3.0
3 37.774836 -122.446546 70.0
4 37.804562 -122.271738 222.0
5 37.770407 -122.391198 323.0
6 37.317298 -121.884995 312.0
7 37.795393 -122.404770 127.0
8 37.795393 -122.404770 127.0
9 37.788975 -122.403452 121.0
end_station_name end_station_latitude \
0 Commercial St at Montgomery St 37.794231
1 Berry St at 4th St 37.775880
2 Powell St BART Station (Market St at 4th St) 37.786375
3 Central Ave at Fell St 37.773311
4 10th Ave at E 15th St 37.792714
5 Broadway at Kearny 37.798014
6 San Jose Diridon Station 37.329732
7 Valencia St at 21st St 37.756708
8 Valencia St at 21st St 37.756708
9 Mission Playground 37.759210
end_station_longitude bike_id user_type member_birth_year \
0 -122.402923 4902 Customer 1984.0
1 -122.393170 2535 Customer NaN
2 -122.404904 5905 Customer 1972.0
3 -122.444293 6638 Subscriber 1989.0
4 -122.248780 4898 Subscriber 1974.0
5 -122.405950 5200 Subscriber 1959.0
6 -121.901782 3803 Subscriber 1983.0
7 -122.421025 6329 Subscriber 1989.0
8 -122.421025 6548 Subscriber 1988.0
9 -122.421339 6488 Subscriber 1992.0
member_gender bike_share_for_all_trip
0 Male No
1 NaN No
2 Male No
3 Other No
4 Male Yes
5 Male No
6 Female No
7 Male No
8 Other No
9 Male No
# Descriptive statistics for numeric variables
print(fgb.describe())
duration_sec start_station_id start_station_latitude \
count 183412.000000 183215.000000 183412.000000
mean 726.078435 138.590427 37.771223
std 1794.389780 111.778864 0.099581
min 61.000000 3.000000 37.317298
25% 325.000000 47.000000 37.770083
50% 514.000000 104.000000 37.780760
75% 796.000000 239.000000 37.797280
max 85444.000000 398.000000 37.880222
start_station_longitude end_station_id end_station_latitude \
count 183412.000000 183215.000000 183412.000000
mean -122.352664 136.249123 37.771427
std 0.117097 111.515131 0.099490
min -122.453704 3.000000 37.317298
25% -122.412408 44.000000 37.770407
50% -122.398285 100.000000 37.781010
75% -122.286533 235.000000 37.797320
max -121.874119 398.000000 37.880222
end_station_longitude bike_id member_birth_year
count 183412.000000 183412.000000 175147.000000
mean -122.352250 4472.906375 1984.806437
std 0.116673 1664.383394 10.116689
min -122.453704 11.000000 1878.000000
25% -122.411726 3777.000000 1980.000000
50% -122.398279 4958.000000 1987.000000
75% -122.288045 5502.000000 1992.000000
max -121.874119 6645.000000 2001.000000
# Parse both timestamp columns (stored as strings) into datetime64 values.
for time_col in ('start_time', 'end_time'):
    fgb[time_col] = pd.to_datetime(fgb[time_col])
fgb.dtypes
duration_sec int64 start_time datetime64[ns] end_time datetime64[ns] start_station_id float64 start_station_name object start_station_latitude float64 start_station_longitude float64 end_station_id float64 end_station_name object end_station_latitude float64 end_station_longitude float64 bike_id int64 user_type object member_birth_year float64 member_gender object bike_share_for_all_trip object dtype: object
# Derive weekday name, day of month, and hour of day for both the start
# ('st' prefix) and end ('en' prefix) timestamps.
for prefix, stamps in (('st', fgb['start_time']), ('en', fgb['end_time'])):
    fgb[prefix + 'day_name'] = stamps.dt.day_name()
    fgb[prefix + 'day_num'] = stamps.dt.day
    fgb[prefix + 'hour'] = stamps.dt.hour
# Member age: the latest end-time year found in the data minus the birth year.
latest_year = fgb['end_time'].dt.year.max()
fgb['member_age_2019'] = latest_year - fgb['member_birth_year']
fgb.head(3)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | member_birth_year | member_gender | bike_share_for_all_trip | stday_name | stday_num | sthour | enday_name | enday_num | enhour | member_age_2019 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 1984.0 | Male | No | Thursday | 28 | 17 | Friday | 1 | 8 | 35.0 |
| 1 | 42521 | 2019-02-28 18:53:21.789 | 2019-03-01 06:42:03.056 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | ... | NaN | NaN | No | Thursday | 28 | 18 | Friday | 1 | 6 | NaN |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | 1972.0 | Male | No | Thursday | 28 | 12 | Friday | 1 | 5 | 47.0 |
3 rows × 23 columns
# Remove the birth-year column and the four station-coordinate columns in place.
columns_to_drop = [
    'member_birth_year',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
]
fgb.drop(columns=columns_to_drop, inplace=True)
fgb.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'end_station_id', 'end_station_name', 'bike_id',
'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name',
'stday_num', 'sthour', 'enday_name', 'enday_num', 'enhour',
'member_age_2019'],
dtype='object')
# Searching for null values.
fgb.isnull().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 end_station_id 197 end_station_name 197 bike_id 0 user_type 0 member_gender 8265 bike_share_for_all_trip 0 stday_name 0 stday_num 0 sthour 0 enday_name 0 enday_num 0 enhour 0 member_age_2019 8265 dtype: int64
# Removing the null values from the fgb dataframe.
fgb.dropna(inplace= True)
fgb.isna().sum()
duration_sec 0 start_time 0 end_time 0 start_station_id 0 start_station_name 0 end_station_id 0 end_station_name 0 bike_id 0 user_type 0 member_gender 0 bike_share_for_all_trip 0 stday_name 0 stday_num 0 sthour 0 enday_name 0 enday_num 0 enhour 0 member_age_2019 0 dtype: int64
# Checking for duplicates in fgb dataframe.
fgb.duplicated().sum()
0
# Checking the number of unique values in each column in the fgb dataframe.
fgb.nunique()
duration_sec 4429 start_time 174941 end_time 174939 start_station_id 329 start_station_name 329 end_station_id 329 end_station_name 329 bike_id 4607 user_type 2 member_gender 3 bike_share_for_all_trip 2 stday_name 7 stday_num 28 sthour 24 enday_name 7 enday_num 28 enhour 24 member_age_2019 75 dtype: int64
Found that:
# I noticed that the unique values in "member_gender" are three, which needs further investigation.
fgb.member_gender.value_counts()
Male 130500 Female 40805 Other 3647 Name: member_gender, dtype: int64
# Rides where the member is male, a Subscriber, and bike_share_for_all_trip is 'No'.
is_male = fgb['member_gender'] == 'Male'
no_bike_share = fgb['bike_share_for_all_trip'] == 'No'
is_subscriber = fgb['user_type'] == 'Subscriber'
fgb_Male_Sub_Bikesh = fgb.loc[is_male & no_bike_share & is_subscriber]
fgb_Male_Sub_Bikesh.shape
(105903, 18)
Observed
# More investigation for the "member_age_2019" column.
fgb.member_age_2019.describe()
count 174952.000000 mean 34.196865 std 10.118731 min 18.000000 25% 27.000000 50% 32.000000 75% 39.000000 max 141.000000 Name: member_age_2019, dtype: float64
Observed
# Cast user_type, member_gender, bike_share_for_all_trip, stday_name and enday_name
# to ordered categorical dtypes, each with an explicit level order.
week_order = ['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
ordinal_var_dict = {
    'user_type': ['Subscriber', 'Customer'],
    'member_gender': ['Male', 'Female', 'Other'],
    'bike_share_for_all_trip': ['No', 'Yes'],
    'stday_name': list(week_order),
    'enday_name': list(week_order),
}
for var, levels in ordinal_var_dict.items():
    ordered_var = pd.api.types.CategoricalDtype(categories=levels, ordered=True)
    fgb[var] = fgb[var].astype(ordered_var)
# Checking the data descriptive statistics for numeric variables after the cleaning process.
fgb.describe()
| duration_sec | start_station_id | end_station_id | bike_id | stday_num | sthour | enday_num | enhour | member_age_2019 | |
|---|---|---|---|---|---|---|---|---|---|
| count | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 |
| mean | 704.002744 | 139.002126 | 136.604486 | 4482.587555 | 15.312337 | 13.456165 | 15.311714 | 13.609533 | 34.196865 |
| std | 1642.204905 | 111.648819 | 111.335635 | 1659.195937 | 8.033926 | 4.734282 | 8.034127 | 4.748029 | 10.118731 |
| min | 61.000000 | 3.000000 | 3.000000 | 11.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 18.000000 |
| 25% | 323.000000 | 47.000000 | 44.000000 | 3799.000000 | 8.000000 | 9.000000 | 8.000000 | 9.000000 | 27.000000 |
| 50% | 510.000000 | 104.000000 | 101.000000 | 4960.000000 | 15.000000 | 14.000000 | 15.000000 | 14.000000 | 32.000000 |
| 75% | 789.000000 | 239.000000 | 238.000000 | 5505.000000 | 22.000000 | 17.000000 | 22.000000 | 18.000000 | 39.000000 |
| max | 84548.000000 | 398.000000 | 398.000000 | 6645.000000 | 28.000000 | 23.000000 | 28.000000 | 23.000000 | 141.000000 |
Observed
# Checking the data columns, types, and shape after the cleaning process.
print(fgb.columns)
print(fgb.dtypes)
print(fgb.shape)
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'end_station_id', 'end_station_name', 'bike_id',
'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name',
'stday_num', 'sthour', 'enday_name', 'enday_num', 'enhour',
'member_age_2019'],
dtype='object')
duration_sec int64
start_time datetime64[ns]
end_time datetime64[ns]
start_station_id float64
start_station_name object
end_station_id float64
end_station_name object
bike_id int64
user_type category
member_gender category
bike_share_for_all_trip category
stday_name category
stday_num int64
sthour int64
enday_name category
enday_num int64
enhour int64
member_age_2019 float64
dtype: object
(174952, 18)
There are 174952 rows in the dataset (after the cleaning process) with 18 features.
The 18 features columns are:
['duration_sec', 'start_time', 'end_time', 'start_station_id', 'start_station_name', 'end_station_id', 'end_station_name', 'bike_id', 'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name', 'stday_num', 'sthour', 'enday_name', 'enday_num', 'enhour', 'member_age_2019'].
Most variables are numeric. The exceptions are the nominal variables (start_station_name, end_station_name), the categorical variables (user_type, member_gender, bike_share_for_all_trip, stday_name, enday_name), and the datetime variables (start_time, end_time).
I am more interested in discovering which features are better to predict with:
I expect that:
['duration_sec', 'start_station_name', 'end_station_name', 'bike_id', 'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name', 'stday_num', 'sthour', 'member_age_2019']
will have the strongest effect on my investigation.
I'll start by looking at the distributions of the main variables of interest:
['duration_sec', 'start_station_name', 'end_station_name', 'bike_id', 'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name', 'stday_num', 'sthour', 'member_age_2019']
fgb.columns
Index(['duration_sec', 'start_time', 'end_time', 'start_station_id',
'start_station_name', 'end_station_id', 'end_station_name', 'bike_id',
'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name',
'stday_num', 'sthour', 'enday_name', 'enday_num', 'enhour',
'member_age_2019'],
dtype='object')
# Creating a sample of (up to) 500 rides to use when needed so that plots are
# clearer and render faster.
# A fixed seed makes the sample -- and every plot built from it -- reproducible
# across notebook runs; the size is capped at the number of available rows so
# the draw without replacement cannot fail on a smaller dataset.
# NOTE: the sample is drawn at this point, so any rows filtered out of fgb
# later in the notebook may still be present in fgb_samp.
sample_rng = np.random.RandomState(42)
samples = sample_rng.choice(fgb.shape[0], min(500, fgb.shape[0]), replace = False)
fgb_samp = fgb.iloc[samples,:]
# Histogram of ride durations on a linear scale: 100-second bins starting at
# 61 s, with the x-axis clipped to the 0-2500 s range.
binsize = 100
bins = np.arange(61, fgb['duration_sec'].max() + binsize, binsize)
fig, ax = plt.subplots(figsize=[20, 5])
ax.hist(fgb['duration_sec'], bins=bins)
ax.set_xlabel('Ride Duration in Sec')
ax.set_ylabel('Count')
ax.set_title('The Distribution of the duration_sec Variable')
ax.set_xlim(0, 2500)
plt.show()
Observed
# Keep only rides shorter than 8000 seconds; the few much longer rides are
# treated as outliers that distort the duration_sec summary statistics.
is_regular_duration = fgb['duration_sec'] < 8000
fgb = fgb.loc[is_regular_duration]
fgb.shape
(174478, 18)
# Checking the descriptive data for the duration_sec column after removing the outliers.
fgb.duration_sec.describe()
count 174478.000000 mean 641.954069 std 554.830569 min 61.000000 25% 322.000000 50% 509.000000 75% 785.000000 max 7958.000000 Name: duration_sec, dtype: float64
Observed:
# Rides per weekday of the start time; value_counts() orders the bars from the
# busiest weekday to the quietest.
rides_stday_name = fgb['stday_name'].value_counts()
ax = rides_stday_name.plot(kind='bar')
plt.xticks(rotation=15);
ax.set_xlabel('Day Name')
ax.set_ylabel('Count')
ax.set_title('Total Number of Rides Made Each Day');
Observed:
# Display the total number of rides for each day of the month (by start date),
# ordered from the day with the most rides to the day with the fewest.
rides_stday_num = fgb.stday_num.value_counts()
rides_stday_num.plot(kind='bar');
plt.xticks(rotation=15);
plt.xlabel('Day Number')
plt.ylabel('Count')
plt.title('Total Number of Rides Made Each Day');
Observed:
The days of the month with the most rides are the 28th, 20th, 21st, and 19th, in that order.
The days of the month with the fewest rides are the 9th, 3rd, 2nd, and 13th, in that order.
Needs more investigation
# Displaying the total number of rides done each start hour in descending order.
rides_sthour = fgb.sthour.value_counts()
rides_sthour.plot(kind='bar')
plt.xticks(rotation=15)
plt.xlabel('Start Hours')
plt.ylabel('Rides Count')
plt.title('Count of Rides by Start Hours');
# Plotting total number of rides by start hour.
# One unit-wide bin per hour, centred on the integer hour (edges -0.5 ... 23.5).
# Edges 0..23 would give only 23 bins and lump hours 22 and 23 into the last bar.
plt.figure(figsize=[20, 5])
bins = np.arange(-0.5, 24, 1)
fgb.sthour.plot(kind='hist', bins=bins)
plt.xticks(np.arange(0, 24, step=1));
plt.xlabel('Hour')
plt.ylabel('Count')
plt.title('Total Number of Rides by Hour');
Observed:
# Displaying the total number of rides done each end hour in descending order.
rides_enhour = fgb.enhour.value_counts()
rides_enhour.plot(kind='bar')
plt.xticks(rotation=15)
plt.xlabel('End Hours')
plt.ylabel('Rides Count')
plt.title('Count of Rides by End Hours');
Observed:
# Showing the start stations with the highest number of rides.
plt.figure(figsize=[20, 5])
fgb['start_station_name'].value_counts().iloc[0:50].plot(kind= 'bar');
plt.xlabel('Station Name')
plt.ylabel('Count')
plt.title('Start Stations with the Highest Number of Rides');
Observed:
# Showing the end stations with the highest number of rides.
plt.figure(figsize=[20, 5])
fgb['end_station_name'].value_counts().iloc[0:50].plot(kind= 'bar');
plt.xlabel('Station Name')
plt.ylabel('Count')
plt.title('End Stations with the Highest Number of Rides');
Observed:
# Showing the start stations with the lowest number of rides.
plt.figure(figsize=[20, 5])
fgb['start_station_name'].value_counts().iloc[-50:].plot(kind= 'bar');
plt.xlabel('Station Name')
plt.ylabel('Count')
plt.title('Start Stations with the Lowest Number of Rides');
Observed:
Needs more investigation
# Showing the end stations with the lowest number of rides.
plt.figure(figsize=[20, 5])
fgb['end_station_name'].value_counts().iloc[-50:].plot(kind= 'bar');
plt.xlabel('Station Name')
plt.ylabel('Count')
plt.title('End Stations with the Lowest Number of Rides');
Observed:
# Showing the number of rides taken by each bike.
plt.figure(figsize=[20, 5])
fgb.bike_id.value_counts().plot(kind = 'bar');
plt.xticks([]);
plt.xlabel('Bike Id')
plt.ylabel('Count')
plt.title('Number of Rides Taken by Each Bike');
# Showing bike_id Value Counts.
plt.figure(figsize=[20, 5])
fgb.bike_id.value_counts(bins=10).plot(kind='bar');
plt.xlabel('Bike Id')
plt.ylabel('Count')
plt.title('Bike Id Value Counts');
Observed:
# Plotting total number of rides by bike_id.
plt.figure(figsize=[20, 5])
fgb.bike_id.plot(kind="hist");
plt.xlabel('Bike Id')
plt.ylabel('Count')
plt.title('Total Number of Rides by Bike Id');
Observed:
Needs more investigation
# Ratio of rides taken by Subscribers.
len(fgb[fgb['user_type'] == 'Subscriber']) / len(fgb)
0.906332030399248
# Total number of rides categorized by user type.
fgb.user_type.value_counts().plot(kind = 'bar');
plt.xlabel('User Type')
plt.ylabel('Count')
plt.title('Total Number of Rides by User Type');
Observed:
fgb.member_age_2019.describe()
count 174478.000000 mean 34.194500 std 10.119272 min 18.000000 25% 27.000000 50% 32.000000 75% 39.000000 max 141.000000 Name: member_age_2019, dtype: float64
# Ride counts per member age, with the y-axis capped at 500 so that sparsely
# populated ages remain visible next to the common ones.
plt.figure(figsize=[20, 5])
age_counts = fgb['member_age_2019'].value_counts()
ax = age_counts.plot(kind='bar')
ax.set_ylim(0, 500)
ax.set_xlabel('Member Age')
ax.set_ylabel('Count')
ax.set_title('Checking for Age Outliers');
Observed
# Keeping only rides whose member age is strictly below 85; rides with a
# recorded age of 85 or more (the maximum seen above is 141) are dropped as outliers.
fgb = fgb.query('member_age_2019 < 85')
fgb.shape
(174289, 18)
# Member age descriptive statistics after removing the age outliers.
fgb.member_age_2019.describe()
count 174289.000000 mean 34.123467 std 9.879442 min 18.000000 25% 27.000000 50% 32.000000 75% 39.000000 max 81.000000 Name: member_age_2019, dtype: float64
# Total rides taken by user age in 2019 after removing outliers.
plt.figure(figsize=[20, 5])
fgb.member_age_2019.value_counts().plot(kind = 'bar');
plt.xlabel('Member Age')
plt.ylabel('Count')
plt.title('Total Rides Taken by Member Age in 2019');
Observed:
Needs more investigation
# Showing the member gender categories and their values.
fgb.member_gender.value_counts()
Male 130050 Female 40625 Other 3614 Name: member_gender, dtype: int64
Observed:
# Share of rides for each gender category, printed in the order Male, Female, Other.
total_rides = len(fgb)
for gender in ('Male', 'Female', 'Other'):
    print(len(fgb[fgb['member_gender'] == gender]) / total_rides)
0.7461744573667873 0.23308986797789877 0.020735674655313875
# Plotting the number of rides made by gender type.
fgb.member_gender.value_counts().plot(kind = 'bar');
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Number of Rides Made by Gender');
Observed:
# Comparison chart between users who use bike sharing for all rides and users who don't.
fgb.bike_share_for_all_trip.value_counts().plot(kind = 'bar');
plt.xlabel('Bike Share for All Trips')
plt.ylabel('Count')
plt.title('Bike Sharing for All Rides Yes vs No');
Observed:
Needs more investigation
To start off with, I want to look at the pairwise correlations present between features in the data.
num_vars = ['duration_sec', 'member_age_2019', 'stday_num', 'enday_num', 'sthour', 'enhour',
'start_station_id', 'end_station_id', 'bike_id']
#cat_vars = ['start_station_name', 'end_station_name', 'user_type', 'member_gender', 'bike_share_for_all_trip']
cat_vars = ['user_type', 'member_gender', 'bike_share_for_all_trip']
# I want to look at the pairwise correlations present between features in the data.
g = sb.PairGrid(data = fgb, vars = num_vars)
g = g.map_diag(plt.hist, bins = 20);
g.map_offdiag(plt.scatter);
# Correlation plot
plt.figure(figsize = [8, 5])
sb.heatmap(fgb[num_vars].corr(), annot = True, fmt = '.3f',
cmap = 'vlag_r', center = 0)
plt.show()
Observed:
Plan on investigating next:
# plot matrix of numeric features against categorical features.
# can use a larger sample since there are fewer plots and they're simpler in nature.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
y_vars=['duration_sec', 'start_station_id', 'end_station_id', 'bike_id', 'stday_num', 'sthour', 'member_age_2019']
def boxgrid(x, y, **kwargs):
    """Draw a single-colour box plot of ``y`` grouped by ``x`` on the current axes.

    Used as the plotting callback for seaborn's ``PairGrid.map``, which calls it
    once per panel with that panel's x and y data.

    Parameters
    ----------
    x, y : data vectors for the grouping variable and the numeric variable.
    **kwargs : accepted so the grid can pass extra options, but ignored so that
        every panel is drawn in the same default colour.
    """
    default_color = sb.color_palette()[0]
    # Pass x and y by keyword: recent seaborn releases only accept `data`
    # positionally, so the positional form fails there.
    sb.boxplot(x=x, y=y, color=default_color)
# PairGrid creates and sizes its own figure (via height/aspect), so no separate
# plt.figure() call is made here; it would only leave an empty figure behind.
g = sb.PairGrid(data=fgb_samp, y_vars=y_vars, x_vars=cat_vars, height=3, aspect=1.5)
g.map(boxgrid)
plt.show()
<Figure size 720x720 with 0 Axes>
Observed
# Finally, look at the pairwise relationships between the categorical features
# using clustered count plots stacked in a single column.
# Each entry: (x variable, hue variable, legend options); None keeps the
# legend seaborn draws by default.
panel_specs = [
    ('user_type', 'member_gender', None),
    ('user_type', 'bike_share_for_all_trip', {'loc': 1, 'ncol': 2}),
    ('user_type', 'stday_name', {'loc': 1, 'ncol': 2}),
    ('member_gender', 'bike_share_for_all_trip', None),
    ('member_gender', 'stday_name', {'ncol': 2}),
    ('bike_share_for_all_trip', 'stday_name', {'loc': 1, 'ncol': 2}),
]
plt.figure(figsize = [10, 15])
for position, (x_var, hue_var, legend_opts) in enumerate(panel_specs, start=1):
    ax = plt.subplot(len(panel_specs), 1, position)
    sb.countplot(data = fgb, x = x_var, hue = hue_var, palette = 'Blues')
    if legend_opts is not None:
        # re-arrange the legend to reduce overlap with the bars
        ax.legend(**legend_opts)
plt.show()
Observed
- There doesn't seem to be much interaction between the categorical variables above, though proportionally there appear to be more rides by Males, by Subscribers, and by users with bike_share_for_all_trip set to 'No'.
With the preliminary look at bivariate relationships out of the way, I want to dig into some of the relationships more. First, I want to see how 'bike_id' and ('stday_num', 'enday_num') are related to one another for all of the data.
# Violin plot of bike_id for every end day of the month, over all rides;
# the inner lines mark the quartiles.
plt.figure(figsize=[20, 5])
base_color = sb.color_palette()[0]
ax = sb.violinplot(data=fgb, x='enday_num', y='bike_id', color=base_color, inner='quartile')
ax.set_xlabel('End Day Number')
ax.set_ylabel('Bike Id')
ax.set_title('Relation between Bike Id and End Day Number');
# Showing how 'bike_id' and 'stday_num' are related to one another for a data sample.
sb.regplot(data=fgb_samp, y='bike_id', x='stday_num');
plt.xlabel('Start Day Number')
plt.ylabel('Bike Id')
plt.title('Relation between Bike Id and Start Day Number');
Observed
# Histogram of bike ids per start station, using the sampled rides.
# The bin edges cover the full bike_id range, so no bike is silently dropped.
bins = np.arange(0, fgb['bike_id'].max() + 100, 100)
g = sb.FacetGrid(data=fgb_samp, col= 'start_station_name', col_wrap=3, height=5, aspect=2);
# x is given as a column name so each facet histograms only its own subset of
# rows; passing the whole fgb.bike_id column would bypass the faceting.
g.map_dataframe(sb.histplot, x='bike_id', bins=bins);
g.set_xlabels('Bike Id')
g.set_ylabels('Count');
Observed:
# Showing how 'sthour' and 'enhour' are related to one another for all of the data.
plt.scatter(data=fgb, x='sthour', y='enhour');
plt.xlabel('Start Hour')
plt.ylabel('End Hour')
plt.title('The Relation Between Start Hour and End Hour');
Observed
# Box plot of ride duration for each user type; the y-axis is limited to
# 0-3000 seconds.
plt.figure(figsize=[20, 5])
base_color = sb.color_palette()[0]
ax = sb.boxplot(data=fgb, x='user_type', y='duration_sec', color=base_color)
ax.set_ylim(0, 3000)
ax.set_xlabel('User Type')
ax.set_ylabel('Duration in Seconds')
ax.set_title('User Type vs Duration in Seconds');
Observed:
# Bike Share for All Trips vs Member Age in 2019.
plt.figure(figsize=[20, 5])
base_color = sb.color_palette()[0]
sb.boxplot(data=fgb, x='bike_share_for_all_trip', y='member_age_2019', color=base_color);
plt.ylim(0, 70);
plt.xlabel('Bike Share for All Trips')
plt.ylabel('Member Age in 2019')
plt.title('Bike Share for All Trips vs Member Age in 2019');
Observed:
# Bike Share for All Trips vs Duration in Seconds.
plt.figure(figsize=[20, 5])
base_color = sb.color_palette()[0]
sb.boxplot(data=fgb, x='bike_share_for_all_trip', y='duration_sec', color=base_color);
plt.ylim(0, 2000);
plt.xlabel('Bike Share for All Trips')
plt.ylabel('Duration in Seconds')
plt.title('Bike Share for All Trips vs Duration in Seconds');
Observed:
# Preparing the data for creating a heatmap of start_station_name vs user_type:
# a table with one row per start station and one column per user type.
# observed=False is passed explicitly and missing combinations are filled with
# 0, so the table always holds integer counts (the heatmaps format cells with
# fmt='d') regardless of the pandas default for categorical groupers.
counts = (fgb.groupby(['start_station_name', 'user_type'], observed=False)
             .size()
             .unstack(fill_value=0))
# Ten busiest start stations for each user type.
counts_subscriber = counts.sort_values('Subscriber', ascending=False).iloc[:10]
counts_customer = counts.sort_values('Customer', ascending=False).iloc[:10]
# Creating Heatmap of start_station_name vs user_type.
sb.heatmap(counts_subscriber, annot=True, fmt='d', cmap='viridis_r');
plt.xlabel('User Type')
plt.ylabel('Station Name')
plt.title('Heatmap of Start Station Name vs User Type Sorted Descending by Subscriber Count');
# Creating Heatmap of start_station_name vs user_type.
sb.heatmap(counts_customer, annot=True, fmt='d', cmap='viridis_r');
plt.xlabel('User Type')
plt.ylabel('Station Name')
plt.title('Heatmap of Start Station Name vs User Type Sorted Descending by Customer Count');
# Showing the relationship between start station name and the count of user types (Customer, Subscriber).
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='start_station_name', hue='user_type');
plt.xticks([]);
plt.ylabel('Count')
plt.xlabel('Station Name')
plt.title('Start Station Name vs User Type');
Observed:
# Preparing the data for creating a heatmap of start_station_name vs member_gender:
# a table with one row per start station and one column per gender.
# observed=False is passed explicitly and missing combinations are filled with
# 0, so the table always holds integer counts (the heatmaps format cells with
# fmt='d') regardless of the pandas default for categorical groupers.
counts_gender = (fgb.groupby(['start_station_name', 'member_gender'], observed=False)
                    .size()
                    .unstack(fill_value=0))
# Ten busiest start stations for each gender category.
counts_male = counts_gender.sort_values('Male', ascending=False).iloc[:10]
counts_female = counts_gender.sort_values('Female', ascending=False).iloc[:10]
counts_other = counts_gender.sort_values('Other', ascending=False).iloc[:10]
# Creating Heatmap of start_station_name vs member_gender.
sb.heatmap(counts_male, annot=True, fmt='d', cmap='viridis_r');
plt.xlabel('Member Gender')
plt.ylabel('Station Name')
plt.title('Heatmap of Start Station Name vs Member Gender Sorted Descending by Male Count');
# Creating Heatmap of start_station_name vs member_gender.
sb.heatmap(counts_female, annot=True, fmt='d', cmap='viridis_r');
plt.xlabel('Member Gender')
plt.ylabel('Station Name')
plt.title('Heatmap of Start Station Name vs Member Gender Sorted Descending by Female Count');
# Creating Heatmap of start_station_name vs member_gender.
sb.heatmap(counts_other, annot=True, fmt='d', cmap='viridis_r');
plt.xlabel('Member Gender')
plt.ylabel('Station Name')
plt.title('Heatmap of Start Station Name vs Member Gender Sorted Descending by Other Count');
# Ride counts per start station, split by member gender.
# Station tick labels are hidden because there are too many to read.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='start_station_name', hue='member_gender');
plt.xticks([]);
# 'Count' belongs on the y-axis; setting it as the x label was immediately
# overwritten by 'Station Name' and left the y-axis unlabelled.
plt.ylabel('Count')
plt.xlabel('Station Name')
plt.title('Start Station Name vs Member Gender');
Observed:
# The relation between start_station_name and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='start_station_name', hue='bike_share_for_all_trip');
plt.xticks([]);
plt.xlim(0, 100);
plt.xlabel('Station Name')
plt.ylabel('Count')
plt.title('Start Station Name vs Bike Share for All Trips');
Observed:
# The relation between sthour and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='sthour', hue='bike_share_for_all_trip');
plt.xlabel('Start Hour')
plt.ylabel('Count')
plt.title('Start Hour vs Bike Share for All Trips');
Observed:
# The relation between stday_name and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='stday_name', hue='bike_share_for_all_trip');
plt.xlabel('Day Name')
plt.ylabel('Count')
plt.title('Start Day Name vs Bike Share for All Trips');
Observed:
# The relation between sthour and member_gender.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='sthour', hue='member_gender');
plt.xlabel('Start Hour')
plt.ylabel('Count')
plt.title('Start Hour vs Member Gender');
Observed:
# The relation between stday_name and member_gender.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='stday_name', hue='member_gender');
plt.xlabel('Day Name')
plt.ylabel('Count')
plt.title('Start Day Name vs Member Gender');
Observed:
# The relation between sthour and user_type.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='sthour', hue='user_type');
plt.xlabel('Start Hour')
plt.ylabel('Count')
plt.title('Start Hour vs User Type');
Observed:
# The relation between stday_name and user_type.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='stday_name', hue='user_type');
plt.xlabel('Day Name')
plt.ylabel('Count')
plt.title('Start Day Name vs User Type');
Observed:
# The relation between user_type and member_gender.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='user_type', hue='member_gender');
plt.xlabel('User Type')
plt.ylabel('Count')
plt.title('User Type vs Member Gender');
Observed:
# The relation between user_type and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='user_type', hue='bike_share_for_all_trip');
plt.xlabel('User Type')
plt.ylabel('Count')
plt.title('User Type vs Bike Share for All Trips');
Observed:
# The relation between member_gender and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='member_gender', hue='bike_share_for_all_trip');
plt.xlabel('Member Gender')
plt.ylabel('Count')
plt.title('Member Gender vs Bike Share for All Trips');
Observed:
# The relation between member_gender and user_type.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='member_gender', hue='user_type');
plt.xlabel('Member Gender')
plt.ylabel('Count')
plt.title('Member Gender vs User Type');
# The relation between user_type and bike_share_for_all_trip.
plt.figure(figsize= [20, 5])
sb.countplot(data=fgb, x='user_type', hue='bike_share_for_all_trip');
plt.xlabel('User Type')
plt.ylabel('Count')
plt.title('User Type vs Bike Share for All Trips');
Observed
# Histogram of bike ids for every member age, one facet per age.
# The bin edges cover the full bike_id range; a fixed upper edge of 6000 would
# silently drop every bike with a higher id.
bins = np.arange(0, fgb['bike_id'].max() + 100, 100)
g = sb.FacetGrid(data=fgb, col= 'member_age_2019', col_wrap=3, height=5, aspect=2);
g.map_dataframe(sb.histplot, x='bike_id', bins=bins);
g.set_xlabels('Bike Id')
g.set_ylabels('Count');
Observed:
The main thing I want to explore in this part of the analysis is how the categorical variables of 'user_type', 'member_gender', 'bike_share_for_all_trip', 'stday_name' play into the relationship between 'duration_sec', 'stday_num', and 'member_age_2019'.
# Annotated heatmap of the pairwise correlation coefficients between the
# columns of fgb listed in num_vars.
corr_matrix = fgb[num_vars].corr()
ax = sb.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='rocket_r')
ax.set(
    xlabel='Numerical Vars',
    ylabel='Numerical Vars',
    title='Correlation Coefficient between Numerical Variables in fgb Heatmap',
);
# Trip duration against start day name, with each point colored by
# member_age_2019 (reversed viridis colormap, half-transparent markers).
plt.scatter(
    'stday_name',
    'duration_sec',
    c='member_age_2019',
    cmap='viridis_r',
    alpha=0.5,
    data=fgb,
)
plt.colorbar(label='Members Ages in 2019')
plt.title('Day Name vs Members Ages in 2019 vs Duration in Seconds')
plt.xlabel('Day Name')
plt.ylabel('Duration in Seconds')
# Tilt the tick labels on the x axis.
plt.xticks(rotation=30);
# Trip duration against member age on the fgb_samp frame, with points
# colored by start day name.
ax = sb.scatterplot(x='member_age_2019', y='duration_sec', hue='stday_name', data=fgb_samp)
plt.xticks(rotation=30)
ax.set_title('Relation between Member Age in 2019, Duration in Seconds, and Start Day Name')
ax.set_xlabel('Member Age in 2019')
ax.set_ylabel('Duration in Seconds');
# Trip duration against member age, one seaborn scatterplot per start day
# (facets laid out as columns).
g = sb.FacetGrid(data=fgb, col='stday_name')
g.map_dataframe(sb.scatterplot, y='duration_sec', x='member_age_2019')
g.set_ylabels('Duration in Seconds')
g.set_xlabels('Member Age in 2019');
# Trip duration against member age, one matplotlib scatter per start day
# (facets laid out as columns).
g = sb.FacetGrid(fgb, col='stday_name')
g.map(plt.scatter, 'member_age_2019', 'duration_sec')
g.set_ylabels('Duration in Seconds')
g.set_xlabels('Member Age in 2019');
Observed:
# Trip duration against member age with a regression fit, one facet per
# value of sthour, wrapping to a new row after every six facets.
g = sb.FacetGrid(fgb, col='sthour', col_wrap=6)
g.map(sb.regplot, 'member_age_2019', 'duration_sec')
g.set_ylabels('Duration in Seconds')
g.set_xlabels('Member Age in 2019');
Observed:
# 2-D histogram of member age against trip duration, in a grid with one
# column per start day and one row per user type (titles in the margins).
# cmin=0.5 is forwarded to plt.hist2d together with the colormap.
g = sb.FacetGrid(fgb, row='user_type', col='stday_name', margin_titles=True)
g.map_dataframe(
    plt.hist2d,
    'member_age_2019',
    'duration_sec',
    cmap='viridis_r',
    cmin=0.5,
)
# NOTE(review): a single colorbar is requested after the whole grid is
# drawn; presumably it reflects only the most recently drawn facet - confirm.
plt.colorbar()
g.set_ylabels('Duration in Seconds')
g.set_xlabels('Member Age in 2019');
Observed:
# Scatter of member age against trip duration, in a grid with one column
# per start day and one row per user type (titles in the margins).
g = sb.FacetGrid(fgb, row='user_type', col='stday_name', margin_titles=True)
g.map_dataframe(plt.scatter, 'member_age_2019', 'duration_sec')
g.set_ylabels('Duration in Seconds')
g.set_xlabels('Member Age in 2019');
Observed:
# The relation between stday_name, user_type, member_age_2019, and duration_sec.
# One scatterplot per start day, colored by user type.
# Each facet is drawn from only that day's rows, so the hue levels are fixed
# up front; otherwise seaborn infers the level order (and therefore the
# colors) separately per facet and the shared legend may not match every panel.
user_type_levels = sorted(fgb['user_type'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name')
g.map_dataframe(
    sb.scatterplot,
    x='member_age_2019',
    y='duration_sec',
    hue='user_type',
    hue_order=user_type_levels,
)
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, user_type, member_age_2019, and duration_sec.
# One point plot per start day (stacked in a single column), with member age
# on the categorical x axis and one line per user type.
# Each facet is drawn from only that day's rows. Without an explicit `order`
# the age categories are inferred per facet, so the same x position can stand
# for different ages in different panels; without `hue_order` the colors can
# differ between panels as well. Both are fixed from the full frame here.
age_levels = sorted(fgb['member_age_2019'].dropna().unique())
user_type_levels = sorted(fgb['user_type'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name', col_wrap=1, height=5, aspect=2)
g.map_dataframe(
    sb.pointplot,
    x='member_age_2019',
    y='duration_sec',
    hue='user_type',
    order=age_levels,
    hue_order=user_type_levels,
)
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, bike_share_for_all_trip, member_age_2019, and duration_sec.
# One scatterplot per start day, colored by the bike_share_for_all_trip flag.
# Each facet is drawn from only that day's rows, so the hue levels are fixed
# up front; otherwise seaborn infers the level order (and therefore the
# colors) separately per facet and the shared legend may not match every panel.
bike_share_levels = sorted(fgb['bike_share_for_all_trip'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name')
g.map_dataframe(
    sb.scatterplot,
    x='member_age_2019',
    y='duration_sec',
    hue='bike_share_for_all_trip',
    hue_order=bike_share_levels,
)
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, bike_share_for_all_trip, member_age_2019, and duration_sec.
# One point plot per start day (stacked in a single column), with member age
# on the categorical x axis and one line per bike_share_for_all_trip value.
# Each facet is drawn from only that day's rows. Without an explicit `order`
# the age categories are inferred per facet, so the same x position can stand
# for different ages in different panels; without `hue_order` the colors can
# differ between panels as well. Both are fixed from the full frame here.
age_levels = sorted(fgb['member_age_2019'].dropna().unique())
bike_share_levels = sorted(fgb['bike_share_for_all_trip'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name', col_wrap=1, height=5, aspect=2)
g.map_dataframe(
    sb.pointplot,
    x='member_age_2019',
    y='duration_sec',
    hue='bike_share_for_all_trip',
    order=age_levels,
    hue_order=bike_share_levels,
)
# Apply the y-limit through the grid so every facet gets it explicitly,
# rather than relying on whichever axes happens to be current.
g.set(ylim=(0, 10000))
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
# The relation between stday_name, member_gender, member_age_2019, and duration_sec.
# One scatterplot per start day, colored by member gender.
# Each facet is drawn from only that day's rows, so the hue levels are fixed
# up front; otherwise seaborn infers the level order (and therefore the
# colors) separately per facet and the shared legend may not match every panel.
gender_levels = sorted(fgb['member_gender'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name')
g.map_dataframe(
    sb.scatterplot,
    x='member_age_2019',
    y='duration_sec',
    hue='member_gender',
    hue_order=gender_levels,
)
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, member_gender, member_age_2019, and duration_sec.
# One point plot per start day (stacked in a single column), with member age
# on the categorical x axis and one line per member gender.
# Each facet is drawn from only that day's rows. Without an explicit `order`
# the age categories are inferred per facet, so the same x position can stand
# for different ages in different panels; without `hue_order` the colors can
# differ between panels as well. Both are fixed from the full frame here.
age_levels = sorted(fgb['member_age_2019'].dropna().unique())
gender_levels = sorted(fgb['member_gender'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name', col_wrap=1, height=5, aspect=2)
g.map_dataframe(
    sb.pointplot,
    x='member_age_2019',
    y='duration_sec',
    hue='member_gender',
    order=age_levels,
    hue_order=gender_levels,
)
# Apply the y-limit through the grid so every facet gets it explicitly,
# rather than relying on whichever axes happens to be current.
g.set(ylim=(0, 20000))
g.add_legend()
g.set_xlabels('Member Age in 2019')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, member_gender, member_age_2019, and bike_share_for_all_trip.
# Box plots of member age per gender on the fgb_samp frame, split by the
# bike_share_for_all_trip flag, one facet per start day (four per row).
# Each facet is drawn from only that day's rows, so the x categories and hue
# levels are fixed up front; otherwise seaborn infers them per facet and box
# positions or colors can differ between panels and from the legend.
gender_levels = sorted(fgb_samp['member_gender'].dropna().unique())
bike_share_levels = sorted(fgb_samp['bike_share_for_all_trip'].dropna().unique())
g = sb.FacetGrid(fgb_samp, col='stday_name', col_wrap=4)
g.map_dataframe(
    sb.boxplot,
    x='member_gender',
    y='member_age_2019',
    hue='bike_share_for_all_trip',
    order=gender_levels,
    hue_order=bike_share_levels,
)
g.add_legend()
g.set_xlabels('Member Gender')
g.set_ylabels('Member Age in 2019');
Observed:
# The relation between stday_name, user_type, duration_sec, and bike_share_for_all_trip.
# Box plots of trip duration per user type, split by the
# bike_share_for_all_trip flag, one facet per start day (four per row).
# Each facet is drawn from only that day's rows, so the x categories and hue
# levels are fixed up front; otherwise seaborn infers them per facet and box
# positions or colors can differ between panels and from the legend.
user_type_levels = sorted(fgb['user_type'].dropna().unique())
bike_share_levels = sorted(fgb['bike_share_for_all_trip'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name', col_wrap=4)
g.map_dataframe(
    sb.boxplot,
    x='user_type',
    y='duration_sec',
    hue='bike_share_for_all_trip',
    order=user_type_levels,
    hue_order=bike_share_levels,
)
g.add_legend()
g.set_xlabels('User Type')
g.set_ylabels('Duration in Seconds');
Observed:
# The relation between stday_name, user_type, sthour, and member_gender.
# Box plots of sthour per user type, split by member gender, one facet per
# start day (four per row).
# Each facet is drawn from only that day's rows, so the x categories and hue
# levels are fixed up front; otherwise seaborn infers them per facet and box
# positions or colors can differ between panels and from the legend.
user_type_levels = sorted(fgb['user_type'].dropna().unique())
gender_levels = sorted(fgb['member_gender'].dropna().unique())
g = sb.FacetGrid(fgb, col='stday_name', col_wrap=4)
g.map_dataframe(
    sb.boxplot,
    x='user_type',
    y='sthour',
    hue='member_gender',
    order=user_type_levels,
    hue_order=gender_levels,
)
g.add_legend()
g.set_xlabels('User Type')
g.set_ylabels('Start Hour');
Observed: